options(scipen=999)
seed = 2018
pkgs = c('data.table', 'magrittr', 'caret', 'glmnet', 'ranger', 'plotly', 'iml')
inst = lapply(pkgs, library, character.only=T, quietly=T); rm(pkgs, inst)
source('~/Documents/R/Utils/functions/summarize.R')
source('~/Documents/R/Utils/functions/one_hot.R')
source('~/Documents/R/Utils/functions/model_auc.R')
source('~/Documents/R/Utils/functions/feature_comparison.R')
source('~/Documents/R/Utils/functions/feature_distribution.R')
source('~/Documents/R/Utils/functions/helper.R')
day = fread("~/Rrrrr/projects/bike_sharing/day.csv") %>%
.[, dteday := as.Date(dteday)]
hour = fread("~/Rrrrr/projects/bike_sharing/hour.csv") %>%
.[, dteday := as.Date(dteday)]
head(day)The goal is to find variables with:
sum.day = Summarize(day); DT::datatable(sum.day, options = list(dom = "tip"))sum.hour = Summarize(hour); DT::datatable(sum.hour, options = list(dom = "tip"))day[,.N,.(weekday, workingday, holiday)]FeatureComparison(list(holiday = day[holiday==1, cnt],
weekday = day[holiday==0 & workingday==1, cnt],
weekend = day[holiday==0 & workingday==0, cnt]),
vectorName = "Total rental", sample = F, numerical = T)FeatureComparison(list(holiday = day[holiday==1, casual],
weekday = day[holiday==0 & workingday==1, casual],
weekend = day[holiday==0 & workingday==0, casual]),
vectorName = "Casual rental", sample = F, numerical = T)FeatureComparison(list(holiday = day[holiday==1, registered],
weekday = day[holiday==0 & workingday==1, registered],
weekend = day[holiday==0 & workingday==0, registered]),
vectorName = "Registered rental", sample = T, numerical = T)2012-10-29 & 2012-10-30 why is it so low?
FeatureDistribution(list(weekday = day[holiday==0 & workingday==1],
weekend = day[holiday==0 & workingday==0],
holiday = day[holiday==1], overall = day),
featureName = "cnt", periodName = "dteday")2011-07-04 is very high – 4th of July!FeatureDistribution(list(weekday = day[holiday==0 & workingday==1],
weekend = day[holiday==0 & workingday==0],
holiday = day[holiday==1], overall = day),
featureName = "casual", periodName = "dteday")FeatureDistribution(list(weekday = day[holiday==0 & workingday==1],
weekend = day[holiday==0 & workingday==0],
holiday = day[holiday==1], overall = day),
featureName = "registered", periodName = "dteday")FeatureDistribution(list(Clear = day[weathersit==1],
Mist = day[weathersit==2],
Light = day[weathersit==3]),
featureName = "casual", periodName = "mnth")FeatureDistribution(list(Clear = day[weathersit==1],
Mist = day[weathersit==2],
Light = day[weathersit==3]),
featureName = "registered", periodName = "mnth")FeatureComparison(list(S = day[weekday==0, casual],
M = day[weekday==1, casual],
T = day[weekday==2, casual],
W = day[weekday==3, casual],
Th = day[weekday==4, casual],
F = day[weekday==5, casual],
St = day[weekday==6, casual]),
vectorName = "Casual rental", sample = F, numerical = T)FeatureComparison(list(S = day[weekday==0, registered],
M = day[weekday==1, registered],
T = day[weekday==2, registered],
W = day[weekday==3, registered],
Th = day[weekday==4, registered],
F = day[weekday==5, registered],
St = day[weekday==6, registered]),
vectorName = "Registered rental", sample = F, numerical = T)How far the dot is from origin is the average number of users during that time
time = hour[workingday==1, .(day = "Weekday", c = mean(casual), r = mean(registered)), hr] %>%
rbind(hour[weekday==6, .(day = "Sat", c = mean(casual), r = mean(registered)), hr]) %>%
rbind(hour[weekday==0, .(day = "Sun", c = mean(casual), r = mean(registered)), hr]) %>%
.[, deg := hr*15] %>%
.[, let := LETTERS[hr+1]]
# plot_ly(time, x=~hr, y=~cnt, color = ~day)
p <- plot_ly(type = 'scatterpolar', mode = 'markers',
r = time[day=="Weekday"]$c,
theta = time[day=="Weekday"]$deg,
name = "Casual_Week", marker = list(opacity = 0.5)) %>%
add_trace(type = 'scatterpolar', mode = 'markers',
r = time[day=="Sat"]$c,
theta = time[day=="Sat"]$deg,
name = "Casual_Sat", marker = list(opacity = 0.5)) %>%
add_trace(type = 'scatterpolar', mode = 'markers',
r = time[day=="Sun"]$c,
theta = time[day=="Sun"]$deg,
name = "Casual_Sun", marker = list(opacity = 0.5)) %>%
add_trace(type = 'scatterpolar', mode = 'markers',
r = time[day=="Weekday"]$r,
theta = time[day=="Weekday"]$deg,
name = "Reg_Week",
marker = list(opacity = 0.5)) %>%
add_trace(type = 'scatterpolar', mode = 'markers',
r = time[day=="Sat"]$r,
theta = time[day=="Sat"]$deg,
name = "Reg_Sat",
marker = list(opacity = 0.5)) %>%
add_trace(type = 'scatterpolar', mode = 'markers',
r = time[day=="Sun"]$r,
theta = time[day=="Sun"]$deg,
name = "Reg_Sun",
marker = list(opacity = 0.5)) %>%
layout(polar = list(radialaxis = list(angle = 0),
angularaxis = list(direction = 'clockwise', dtick = 15)))
pday[,.N, .(holiday, workingday)]day[holiday==1,.(dteday, yr, weekday, casual, registered, cnt)]hour[dteday=="2011-07-04",.(hr, weathersit, temp, atemp, hum, windspeed, casual, registered, cnt)]cutoff = "2012-09-01"
# train-test split by time
data.train = hour[dteday<cutoff, -c("instant", "dteday", "casual", "registered")]
week.train = hour[dteday<cutoff] %>% .[, week := week(dteday)] %>%
.[, -c("instant", "dteday", "casual", "registered")]
date.train = hour[dteday<cutoff] %>% .[, week := week(dteday)] %>%
.[, date := mday(dteday)] %>%
.[, -c("instant", "dteday", "casual", "registered")]
cas.train = hour[dteday<cutoff]$casual
reg.train = hour[dteday<cutoff]$registered
data.test = hour[dteday>=cutoff, -c("instant", "dteday", "casual", "registered")]
week.test = hour[dteday>=cutoff] %>% .[, week := week(dteday)] %>%
.[, -c("instant", "dteday", "casual", "registered")]
date.test = hour[dteday>=cutoff] %>% .[, week := week(dteday)] %>%
.[, date := mday(dteday)] %>%
.[, -c("instant", "dteday", "casual", "registered")]
# check to make sure distribution is consistent
FeatureDistribution(list(Train = hour[dteday<cutoff],
Test = hour[dteday>=cutoff]),
featureName = "cnt", periodName = "dteday")hour.ohot = copy(hour)
OneHot(hour.ohot, "mnth")
OneHot(hour.ohot, "hr")
OneHot(hour.ohot, "season")
OneHot(hour.ohot, "weekday")
OneHot(hour.ohot, "weathersit")
ohot.train = hour.ohot[dteday<cutoff, -c("instant", "dteday", "casual", "registered")]
ohot.test = hour.ohot[dteday>=cutoff, -c("instant", "dteday", "casual", "registered")]0/1) column# 10-fold CV
set.seed(seed)
glm.ctr <- trainControl(method = "repeatedcv", number = 10, repeats = 5)
# search grid
glm.grd <- expand.grid(alpha = 0:10/10, lambda = 10^seq(3,-6))
# model
set.seed(seed)
glm.all <- train(x = ohot.train[,-"cnt"], y = ohot.train$cnt,
method = "glmnet", trControl = glm.ctr, tuneGrid = glm.grd)There were missing values in resampled performance measures.
plot(glm.all)glm.all$bestTunepred.glm = data.table(actual = ohot.test$cnt,
pred = predict(glm.all, newdata = ohot.test))
plot_ly(pred.glm, x = ~actual, y = ~pred) %>%
add_markers() %>%
add_lines(x=c(1,800), y=c(1,800), showlegend = F)pred.glm.train = data.table(actual = ohot.train$cnt,
pred = predict(glm.all, newdata = ohot.train))
plot_ly(pred.glm.train, x = ~actual, y = ~pred) %>%
add_markers() %>%
add_lines(x=c(1,800), y=c(1,800), showlegend = F)# model
set.seed(seed)
glm.cas <- train(x = ohot.train[,-"cnt"], y = cas.train,
method = "glmnet", trControl = glm.ctr, tuneGrid = glm.grd)There were missing values in resampled performance measures.
plot(glm.cas)glm.cas$bestTuneset.seed(seed)
glm.reg <- train(x = ohot.train[,-"cnt"], y = reg.train,
method = "glmnet", trControl = glm.ctr, tuneGrid = glm.grd)There were missing values in resampled performance measures.
plot(glm.reg)glm.reg$bestTuneglm.two = data.table(hour[dteday>=cutoff, .(actual = cnt, dteday)],
cas = predict(glm.cas, newdata = data.fe)[14492:17379],
reg = predict(glm.reg, newdata = data.fe)[14492:17379]) %>%
.[, pred := cas + reg]Error in cbind2(1, newx) %*% nbeta :
Cholmod error 'X and/or Y have wrong dimensions' at file ../MatrixOps/cholmod_sdmult.c, line 90
FeatureComparison(list(glm.two$actual, glm.two$pred))FeatureDistribution(list(Train = hour[dteday<cutoff,.(dteday, group = "train", value = cnt)],
Test = glm.two[,.(dteday, group = "actual", value = actual)],
Preds = glm.two[,.(dteday, group = "pred", value = pred)]),
featureName = "value", periodName = "dteday")nrounds (# Boosting Iterations), max_depth (Max Tree Depth), eta (Shrinkage), gamma (Minimum Loss Reduction), colsample_bytree (Subsample Ratio of Columns), min_child_weight (Minimum Sum of Instance Weight), subsample# 10-fold CV
set.seed(seed)
xgb.ctr <- trainControl(method = "repeatedcv", number = 10, repeats = 1)
# search grid
# glm.grd <- expand.grid(alpha = 0:10/10, lambda = 10^seq(3,-6))
# model
set.seed(seed)
xgb.all <- train(x = data.train[,-"cnt"], y = data.train$cnt,
method = "xgbTree", trControl = xgb.ctr)
plot(xgb.all)xgb.all$bestTunepred.xgb = data.table(actual = data.test$cnt,
pred = predict(xgb.all, newdata = data.test))
plot_ly(pred.xgb, x = ~actual, y = ~pred) %>%
add_markers() %>%
add_lines(x=c(1,800), y=c(1,800), showlegend = F)pred.xgb.train = data.table(actual = data.train$cnt,
pred = predict(xgb.all, newdata = data.train))
plot_ly(pred.xgb.train, x = ~actual, y = ~pred) %>%
add_markers() %>%
add_lines(x=c(1,800), y=c(1,800), showlegend = F)set.seed(seed)
xgb.cas <- train(x = data.train[,-"cnt"], y = cas.train,
method = "xgbTree", trControl = xgb.ctr)
plot(xgb.cas)xgb.cas$bestTuneset.seed(seed)
xgb.reg <- train(x = data.train[,-"cnt"], y = reg.train,
method = "xgbTree", trControl = xgb.ctr)
plot(xgb.reg)xgb.reg$bestTunepred.two = data.table(hour[dteday>=cutoff, .(actual = cnt, dteday)],
cas = predict(xgb.cas, newdata = data.test),
reg = predict(xgb.reg, newdata = data.test)) %>%
.[, pred := cas + reg]
plot_ly(pred.two, x = ~actual, y = ~pred) %>%
add_markers() %>%
add_lines(x=c(1,800), y=c(1,800), showlegend = F)FeatureComparison(list(pred.two$actual, pred.two$pred))FeatureDistribution(list(Train = hour[dteday<cutoff,.(dteday, group = "train", value = cnt)],
Test = pred.two[,.(dteday, group = "actual", value = actual)],
Preds = pred.two[,.(dteday, group = "pred", value = pred)]),
featureName = "value", periodName = "dteday")# 10-fold CV
set.seed(seed)
xgb.ctr <- trainControl(method = "repeatedcv", number = 10, repeats = 1)
# search grid
# glm.grd <- expand.grid(alpha = 0:10/10, lambda = 10^seq(3,-6))
# model
set.seed(seed)
xgb.week <- train(x = week.train[,-"cnt"], y = week.train$cnt,
method = "xgbTree", trControl = xgb.ctr)
plot(xgb.week)xgb.week$bestTuneweek.xgb = data.table(hour[dteday>=cutoff, .(actual = cnt, dteday)],
pred = predict(xgb.week, newdata = week.test))
plot_ly(week.xgb, x = ~actual, y = ~pred) %>%
add_markers() %>%
add_lines(x=c(1,800), y=c(1,800), showlegend = F)FeatureDistribution(list(Train = hour[dteday<cutoff,.(dteday, group = "train", value = cnt)],
Test = week.xgb[,.(dteday, group = "actual", value = actual)],
Preds = week.xgb[,.(dteday, group = "pred", value = pred)]),
featureName = "value", periodName = "dteday")set.seed(seed)
xgb.date <- train(x = date.train[,-"cnt"], y = date.train$cnt,
method = "xgbTree", trControl = xgb.ctr)
plot(xgb.date)xgb.date$bestTunedate.xgb = data.table(hour[dteday>=cutoff, .(actual = cnt, dteday)],
pred = predict(xgb.date, newdata = date.test))
plot_ly(date.xgb, x = ~actual, y = ~pred) %>%
add_markers() %>%
add_lines(x=c(1,800), y=c(1,800), showlegend = F)FeatureDistribution(list(Train = hour[dteday<cutoff,.(dteday, group = "train", value = cnt)],
Test = date.xgb[,.(dteday, group = "actual", value = actual)],
Preds = date.xgb[,.(dteday, group = "pred", value = pred)]),
featureName = "value", periodName = "dteday")set.seed(seed)
xgb.mth <- train(x = data.train[,-c("mnth","cnt")], y = data.train$cnt,
method = "xgbTree", trControl = xgb.ctr)
plot(xgb.mth)xgb.mth$bestTunemth.xgb = data.table(hour[dteday>=cutoff, .(actual = cnt, dteday)],
pred = predict(xgb.mth, newdata = data.test))
plot_ly(mth.xgb, x = ~actual, y = ~pred) %>%
add_markers() %>%
add_lines(x=c(1,800), y=c(1,800), showlegend = F)mth.xgb.train = data.table(actual = data.train$cnt,
pred = predict(xgb.mth, newdata = data.train))
plot_ly(mth.xgb.train, x = ~actual, y = ~pred) %>%
add_markers() %>%
add_lines(x=c(1,800), y=c(1,800), showlegend = F)FeatureDistribution(list(Train = hour[dteday<cutoff,.(dteday, group = "train", value = cnt)],
Test = date.xgb[,.(dteday, group = "actual", value = actual)],
NoMth = mth.xgb[,.(dteday, group = "pred", value = pred)],
XGB2 = pred.two[,.(dteday, group = "pred", value = pred)],
Week = week.xgb[,.(dteday, group = "pred", value = pred)],
Date = date.xgb[,.(dteday, group = "pred", value = pred)]),
featureName = "value", periodName = "dteday")#calc.relimpsave.image(file = "bike.RData")
load("bike.RData")